{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Facebook"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Get the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import requests\n",
    "from pymongo import MongoClient\n",
    "\n",
    "# Graph API endpoints: the page feed, and the comment stream of one post\n",
    "page_url = 'https://graph.facebook.com/v2.8/Google/feed?fields=id,message,reactions,shares,from,caption,created_time,likes.summary(true)'\n",
    "comments_url = 'https://graph.facebook.com/v2.8/{post_id}/comments?filter=stream&limit=100'\n",
    "\n",
    "# Placeholder: put a valid Graph API access token here before running\n",
    "params = {'access_token': 'YOUR_ACCESS TOKEN'}\n",
    "client = MongoClient('localhost:27017')\n",
    "db = client.facebook\n",
    "collection_posts = db.posts\n",
    "collection_comments = db.comments\n",
    "\n",
    "posts = requests.get(page_url, params=params).json()\n",
    "\n",
    "while True:\n",
    "    try:\n",
    "        # Store every post of the current feed page\n",
    "        for element in posts['data']:\n",
    "            collection_posts.insert_one(element)\n",
    "\n",
    "            # Retrieve the first page of comments for this post\n",
    "            this_comment_url = comments_url.replace(\"{post_id}\", element['id'])\n",
    "            comments = requests.get(this_comment_url, params=params).json()\n",
    "\n",
    "            # Keep paging for as long as the response carries an 'after' cursor\n",
    "            while 'after' in comments.get('paging', {}).get('cursors', {}):\n",
    "                for comment in comments['data']:\n",
    "                    # Remember which post the comment belongs to\n",
    "                    comment['post_id'] = element['id']\n",
    "                    collection_comments.insert_one(comment)\n",
    "\n",
    "                next_params = dict(params, after=comments['paging']['cursors']['after'])\n",
    "                comments = requests.get(this_comment_url, params=next_params).json()\n",
    "\n",
    "        # Go to the next page of the feed\n",
    "        posts = requests.get(posts['paging']['next']).json()\n",
    "    except KeyError as e:\n",
    "        # A missing key ('data', 'paging' or 'next') ends the crawl\n",
    "        print(e)\n",
    "        break"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Pull the data from MongoDB"
   ]
  },
  {
   "cell_type": "raw",
   "metadata": {},
   "source": [
    "posts_data = []\n",
    "comments_data = []\n",
    "\n",
    "# Collect (message, created_time, likes, shares, id) for every stored post\n",
    "for doc in collection_posts.find({}):\n",
    "    try:\n",
    "        posts_data.append((\n",
    "            doc['message'],\n",
    "            doc['created_time'],\n",
    "            doc['likes']['summary']['total_count'],\n",
    "            doc['shares']['count'],\n",
    "            doc['id'],\n",
    "        ))\n",
    "    except (KeyError, TypeError):\n",
    "        # Skip posts that lack one of the fields (for instance no 'message' or no 'shares')\n",
    "        continue\n",
    "\n",
    "# Collect (message, created_time, post_id) for every stored comment\n",
    "for comment in collection_comments.find({}):\n",
    "    try:\n",
    "        comments_data.append((comment['message'], comment['created_time'], comment['post_id']))\n",
    "    except (KeyError, TypeError):\n",
    "        # Skip comments that lack one of the fields\n",
    "        continue\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import nltk\n",
    "\n",
    "# Create dataframes; naming the columns in the constructor also works when no rows were collected\n",
    "df_posts = pd.DataFrame(posts_data, columns=['message','created_time','likes','shares','post_id'])\n",
    "df_comments = pd.DataFrame(comments_data, columns=['message','created_time','post_id'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Pre-process the data"
   ]
  },
  {
   "cell_type": "raw",
   "metadata": {},
   "source": [
    "import re\n",
    "import nltk\n",
    "\n",
    "def preprocess(text):\n",
    "    \"\"\"Clean a raw message and split it into tokens.\n",
    "\n",
    "    Surrounding whitespace is stripped, every character that is neither a word\n",
    "    character nor whitespace is removed, and the text is lower-cased before it\n",
    "    is handed to nltk.word_tokenize.\n",
    "    \"\"\"\n",
    "    stripped = text.strip()\n",
    "    without_punctuation = re.sub(r'[^\\w\\s]', '', stripped)\n",
    "    return nltk.word_tokenize(without_punctuation.lower())\n",
    "\n",
    "def get_hashtags(text):\n",
    "    \"\"\"Return every hashtag found in text, without the leading '#'.\"\"\"\n",
    "    hashtag_pattern = re.compile(r\"#(\\w+)\")\n",
    "    return hashtag_pattern.findall(text)\n",
    "\n",
    "def tag_tokens(preprocessed_tokens):\n",
    "    \"\"\"Return the result of nltk.pos_tag for the given tokens.\"\"\"\n",
    "    return nltk.pos_tag(preprocessed_tokens)\n",
    "\n",
    "def get_keywords(tagged_tokens,pos='all'):\n",
    "    \"\"\"Return the tokens whose tag starts with one of the selected prefixes.\n",
    "\n",
    "    pos picks the prefixes: 'nouns' -> NN, 'verbs' -> VB, 'adjectives' -> JJ;\n",
    "    'all' or any other value selects all three.\n",
    "    Each item of tagged_tokens is indexed as item[0] (token) and item[1] (tag).\n",
    "    \"\"\"\n",
    "    every_prefix = ('NN','JJ','VB')\n",
    "    single_prefix = {'nouns': 'NN', 'verbs': 'VB', 'adjectives': 'JJ'}\n",
    "\n",
    "    # Compare by equality so that any unrecognised value falls back to all prefixes\n",
    "    wanted = next((prefix for name, prefix in single_prefix.items() if pos == name), every_prefix)\n",
    "\n",
    "    return [item[0] for item in tagged_tokens if item[1].startswith(wanted)]\n",
    "\n",
    "def get_noun_phrases(tagged_tokens):\n",
    "    \"\"\"Return the multi-word NP chunks found in tagged_tokens as space-joined strings.\n",
    "\n",
    "    Chunks are produced by nltk.RegexpParser with the grammar\n",
    "    NP: {<DT>?<JJ>*<NN>}; chunks made of a single leaf are left out.\n",
    "    \"\"\"\n",
    "    chunker = nltk.RegexpParser(\"NP: {<DT>?<JJ>*<NN>}\")\n",
    "    parsed = chunker.parse(tagged_tokens)\n",
    "\n",
    "    phrases = []\n",
    "    for chunk in parsed.subtrees(filter=lambda t: t.label() == 'NP'):\n",
    "        leaves = chunk.leaves()\n",
    "        # Only phrases are wanted, not single words\n",
    "        if len(leaves) <= 1:\n",
    "            continue\n",
    "        phrases.append(\" \".join(leaf[0] for leaf in leaves))\n",
    "\n",
    "    return phrases\n",
    "\n",
    "def execute_pipeline(dataframe):\n",
    "    \"\"\"Add the text-mining columns derived from the 'message' column.\n",
    "\n",
    "    The frame is modified in place and also returned. Columns added, in order:\n",
    "    'hashtags', 'preprocessed', 'tagged', 'keywords', 'noun_phrases'.\n",
    "    \"\"\"\n",
    "    # Series.apply stores each returned list as one cell and also copes with an\n",
    "    # empty frame, which row-wise DataFrame.apply(axis=1) does not guarantee.\n",
    "    dataframe['hashtags'] = dataframe['message'].apply(get_hashtags)\n",
    "    dataframe['preprocessed'] = dataframe['message'].apply(preprocess)\n",
    "    dataframe['tagged'] = dataframe['preprocessed'].apply(tag_tokens)\n",
    "    dataframe['keywords'] = dataframe['tagged'].apply(lambda tagged: get_keywords(tagged, 'all'))\n",
    "    dataframe['noun_phrases'] = dataframe['tagged'].apply(get_noun_phrases)\n",
    "\n",
    "    return dataframe\n",
    " \n",
    "# Add the derived text columns to both frames (execute_pipeline returns the frame it was given)\n",
    "df_posts = execute_pipeline(df_posts)\n",
    "df_comments = execute_pipeline(df_comments)\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Visualization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from wordcloud import WordCloud\n",
    "import datetime\n",
    "import itertools\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "def viz_wordcloud(dataframe,column_name,font_path='/Library/Fonts/Verdana.ttf'):\n",
    "    \"\"\"Draw a word cloud of the tokens or phrases stored in one column.\n",
    "\n",
    "    Each cell of dataframe[column_name] must be an iterable of strings.\n",
    "    Spaces inside a phrase are replaced by underscores because all phrases are\n",
    "    joined with spaces before being handed to WordCloud.\n",
    "    font_path is passed to WordCloud; the default is a macOS location, so give\n",
    "    the path of a font file that exists on your system.\n",
    "    \"\"\"\n",
    "    lst_tokens = itertools.chain.from_iterable(dataframe[column_name])\n",
    "    lst_phrases = [phrase.replace(\" \",\"_\") for phrase in lst_tokens]\n",
    "\n",
    "    # WordCloud.generate needs at least one word, so stop early on empty input\n",
    "    if not lst_phrases:\n",
    "        print(\"No words to plot\")\n",
    "        return\n",
    "\n",
    "    wordcloud = WordCloud(font_path=font_path,background_color=\"white\", max_words=2000, max_font_size=40, random_state=42).generate(\" \".join(lst_phrases))\n",
    "\n",
    "    # Display the generated image with matplotlib\n",
    "    plt.figure()\n",
    "    plt.imshow(wordcloud)\n",
    "    plt.axis(\"off\")\n",
    "    plt.show()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Cleaning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Strings used by viz_wordcloud to discard phrases: a phrase is dropped when its lower-cased text contains any of them.\n",
    "# NOTE(review): the match is by substring, so 'd' removes every phrase containing the letter d -- confirm this is intended.\n",
    "CLEANING_LST = ['gulf','d','ban','persic']\n",
    "\n",
    "#lst_phrases = [phrase for phrase in lst_phrases if not any(spam in phrase.lower() for spam in CLEANING_LST)]\n",
    "#lst_phrases = [phrase.replace(\" \",\"_\") for phrase in lst_phrases if len(phrase) > 1 ]\n",
    "\n",
    "def viz_wordcloud(dataframe,column_name,font_path='/Library/Fonts/Verdana.ttf'):\n",
    "    \"\"\"Draw a word cloud of one column after filtering unwanted phrases.\n",
    "\n",
    "    Each cell of dataframe[column_name] must be an iterable of strings.\n",
    "    Entries of length 1 are dropped, spaces inside a phrase are replaced by\n",
    "    underscores, and any phrase whose lower-cased text contains an entry of\n",
    "    CLEANING_LST is removed.\n",
    "    font_path is passed to WordCloud; the default is a macOS location, so give\n",
    "    the path of a font file that exists on your system.\n",
    "    \"\"\"\n",
    "    lst_tokens = itertools.chain.from_iterable(dataframe[column_name])\n",
    "    lst_phrases = [phrase.replace(\" \",\"_\") for phrase in lst_tokens if len(phrase) > 1 ]\n",
    "    lst_phrases = [phrase for phrase in lst_phrases if not any(spam in phrase.lower() for spam in CLEANING_LST)]\n",
    "\n",
    "    # WordCloud.generate needs at least one word, so stop early when nothing is left\n",
    "    if not lst_phrases:\n",
    "        print(\"No words to plot\")\n",
    "        return\n",
    "\n",
    "    wordcloud = WordCloud(font_path=font_path,background_color=\"white\", max_words=2000, max_font_size=40, random_state=42).generate(\" \".join(lst_phrases))\n",
    "\n",
    "    # Display the generated image with matplotlib\n",
    "    plt.figure()\n",
    "    plt.imshow(wordcloud)\n",
    "    plt.axis(\"off\")\n",
    "    plt.show()\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Print results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "HurrayI luv <3 google\n",
      "now this wat i'm talking about.. It's high time google got a page of their own in here !\n",
      "Www.google.com\n",
      "i love gothgoogle!! lol!!!! its a real website!!\n",
      "I'm addicted to google. it makes me smarter.\n"
     ]
    }
   ],
   "source": [
    "# Function to print verbatims\n",
    "\n",
    "def print_verbatims(df,nb_verbatim,keyword):\n",
    "    \"\"\"Print the first nb_verbatim messages of df that match keyword.\n",
    "\n",
    "    keyword is forwarded unchanged to Series.str.contains on the 'message'\n",
    "    column. Rows whose message is missing are treated as non-matching.\n",
    "    \"\"\"\n",
    "    # na=False: a missing message would otherwise put NaN into the mask and break the boolean indexing\n",
    "    verbatims = df.loc[df['message'].str.contains(keyword, na=False), 'message']\n",
    "\n",
    "    for text in verbatims.head(nb_verbatim):\n",
    "        print(text)\n",
    "\n",
    "# Print at most five comments whose message matches 'google'\n",
    "print_verbatims(df_comments,5,'google')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Time series analysis"
   ]
  },
  {
   "cell_type": "raw",
   "metadata": {},
   "source": [
    "import datetime\n",
    "from ggplot import *\n",
    "\n",
    "# Parse the creation timestamps once for the whole column\n",
    "df_comments['date'] = pd.to_datetime(df_comments['created_time'])\n",
    "\n",
    "# Sort the index so that the date-string slice is well defined whatever order the rows were stored in\n",
    "df_comments_ts = df_comments.set_index('date').sort_index()\n",
    "df_comments_ts = df_comments_ts.loc['2015-01-01':]\n",
    "\n",
    "df_posts['date'] = pd.to_datetime(df_posts['created_time'])\n",
    "df_posts_ts = df_posts.set_index('date').sort_index()\n",
    "df_posts_ts = df_posts_ts.loc['2015-01-01':]\n",
    "\n",
    "# Weekly means of the two numeric columns only; the frame also holds text columns\n",
    "dx = df_posts_ts[['likes', 'shares']].resample('W').mean()\n",
    "dx.index.name = 'date'\n",
    "dx = dx.reset_index()\n",
    "\n",
    "p = ggplot(dx, aes(x='date', y = 'likes')) + geom_line()\n",
    "p = p + xlab(\"Date\") + ylab(\"Number of likes\") + ggtitle(\"Facebook Google Page\")\n",
    "\n",
    "print(p)\n",
    "\n",
    "p = ggplot(dx, aes(x='date', y = 'shares')) + geom_line()\n",
    "p = p + xlab(\"Date\") + ylab(\"Number of shares\") + ggtitle(\"Facebook Google Page\")\n",
    "\n",
    "print(p)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Wordcloud on time series"
   ]
  },
  {
   "cell_type": "raw",
   "metadata": {},
   "source": [
    "def max_wordcloud(ts_df_posts,ts_df_comments,columnname,criterium='shares'):\n",
    "    \"\"\"Draw word clouds for the week in which the posts peak on criterium.\n",
    "\n",
    "    ts_df_posts and ts_df_comments must be indexed by date. The weekly mean of\n",
    "    ts_df_posts[criterium] is computed, the week with the highest mean is\n",
    "    located, and viz_wordcloud is called on columnname for the posts and then\n",
    "    for the comments that fall in the seven days ending on that week's label.\n",
    "    \"\"\"\n",
    "    # Average only the criterium column; the frame also carries non-numeric columns\n",
    "    mean_week = ts_df_posts[criterium].resample('W').mean()\n",
    "\n",
    "    # Both ends of the window come from the same peak, whichever criterium is chosen\n",
    "    peak = mean_week.idxmax()\n",
    "    start_week = (peak - datetime.timedelta(days=7)).strftime('%Y-%m-%d')\n",
    "    end_week = peak.strftime('%Y-%m-%d')\n",
    "\n",
    "    # Create the word clouds with the previously defined function\n",
    "    viz_wordcloud(ts_df_posts.loc[start_week:end_week],columnname)\n",
    "    viz_wordcloud(ts_df_comments.loc[start_week:end_week],columnname)\n",
    "    \n",
    "# No criterium is passed, so the peak week is chosen on the default 'shares'\n",
    "max_wordcloud(df_posts_ts,df_comments_ts,'keywords')\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Alchemy API"
   ]
  },
  {
   "cell_type": "raw",
   "metadata": {},
   "source": [
    "import requests\n",
    "import json\n",
    " \n",
    "# Placeholder: replace with your own API key before running\n",
    "apikey = \"YOUR_KEY\"\n",
    " \n",
    "# Endpoint that get_alchemy sends its POST request to\n",
    "url = \"https://gateway-a.watsonplatform.net/calls/text/TextGetEmotion\"\n",
    " \n",
    "def get_alchemy(apikey,text):\n",
    "    \"\"\"POST text to the module-level url and return the 'docEmotions' entry of the JSON reply.\n",
    "\n",
    "    Returns None when the reply body is not valid JSON or holds no\n",
    "    'docEmotions' entry.\n",
    "    \"\"\"\n",
    "    pms = {'apikey' : apikey, 'outputMode' : 'json', \"text\" : text}\n",
    "    result = requests.post(url, data = pms)\n",
    "    try:\n",
    "        return json.loads(result.text)['docEmotions']\n",
    "    except (ValueError, KeyError, TypeError):\n",
    "        # ValueError: body is not JSON; KeyError/TypeError: no 'docEmotions' entry to read\n",
    "        return None\n",
    "\n",
    "# Work on a copy so that adding a column does not write into a slice of df_comments\n",
    "alchemy = df_comments.head(1000).copy()\n",
    "\n",
    "alchemy['alchemy'] = alchemy['message'].apply(lambda message: get_alchemy(apikey, message))\n",
    "\n",
    "# Keep only the comments for which get_alchemy returned a result\n",
    "emotions = [p for p in alchemy['alchemy'] if p is not None]\n",
    "\n",
    "df_emotions = pd.DataFrame(emotions)\n",
    "\n",
    "# Convert the values to numbers; anything that cannot be parsed becomes NaN and is skipped by mean()\n",
    "df_emotions = df_emotions.apply(pd.to_numeric, errors='coerce')\n",
    "\n",
    "# Mean of each column (one value per emotion)\n",
    "df_emotions_means = df_emotions.mean()\n",
    "\n",
    "#Plot the results\n",
    "df_emotions_means.plot(kind='bar', legend=False)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
